PCA

Question: should we correct for batch?

Batch correction for HeLa cells: definitely.
Batch correction for HEK cells: maybe not necessary. Let’s compare both DEAs.

DEA

#' bartelDEA
#'
#' Generates control samples out of the average values over all supplied samples,
#' calculates their logCPM & log2FC data and performs DEA over all treatments & individual ones. 
#' All is assembled into one SE object. 
#' 
#'
#' @param se SummarizedExperiment object containing assays of raw counts
#'
#' @return an SE object
#' 
bartelDEA <- function(se, model, model0=~1){

  se$Batch <- droplevels(se$Batch)
  
  # generate control
    
  ## create logcpm assay
  assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
  
  ## generate a 'control' sample out of the median normalized counts over all samples
  e.ctrl <- genCTRL(se, "logcpm")
  
  ## add control samples to assays & generate DGEList object
  dds <- DGEList(cbind(assay(se), e.ctrl))
  
  ## generate colData info for combined assay
  dd <- rbind(colData(se)[,c("Batch","miRNA","Cell_Line")], data.frame(Batch=unique(se$Batch),
                                                                       Cell_Line=unique(se$Cell_Line),
                                                                       miRNA="CTRL"))
  dd$miRNA <- relevel(droplevels(dd$miRNA), ref="CTRL")
  
  
  # DEA
  
  ## normalization
  dds <- calcNormFactors(dds)
  ## models
  mm <- model.matrix(model, data=dd)
  mm0 <- model.matrix(model0, data=dd)
  testCoeff <- setdiff(colnames(mm), colnames(mm0))
  ## estimate dispersion
  dds <- estimateDisp(dds,mm)
  ## fit negative binomial distribution on counts per gene (use glmFit for few replicates)
  fit <- glmFit(dds,mm)
  
  ## fit linear model on all samples
  res <- as.data.frame(topTags(glmLRT(fit, testCoeff),Inf))
  
  ## fit linear model dropping one sample at a time (using multiple cores)
  res.list <- bplapply( testCoeff, BPPARAM=MulticoreParam(8), FUN=function(x){
    as.data.frame(topTags(glmLRT(fit, x), Inf))
  })
  
  dea.names <- make.names(gsub("miRNA","", testCoeff))
  names(res.list) <- paste0("DEA.",dea.names)
  colnames(res)[grepl("logFC",colnames(res))] <- paste0("logFC.", dea.names)
  
  
  # generate final SE object
  
  ## SE object with logCPM & logFC assays & batch-corrected logCPM assays (for visualization)
  se <- SummarizedExperiment(assays=list(counts=dds$counts), 
                             rowData=rowData(se), 
                             colData=dd) 
  assays(se)$logcpm <- log1p(cpm(calcNormFactors(DGEList(assay(se)))))
  se <- plgINS::svacor(se, form = model, form0 = model0)
  se <- SEtools::log2FC(se, controls = se$miRNA=="CTRL", by = se$readtype, fromAssay = "corrected", isLog = TRUE)
  
  ## add DEAs
  rowData(se)$DEA.all <- DataFrame(res[rownames(se),])
  for(i in paste0("DEA.",dea.names)){
    rowData(se)[[i]] <- DataFrame(res.list[[i]][rownames(se),])
  }
  
  
  # output
  
  return(se)
}
## Number of significant surrogate variables is:  4 
## Iteration (out of 5 ):1  2  3  4  5
## Number of significant surrogate variables is:  2 
## Iteration (out of 5 ):1  2  3  4  5
## Number of significant surrogate variables is:  3 
## Iteration (out of 5 ):1  2  3  4  5

check results

##      DEA.all   DEA.let.7a    DEA.lsy.6    DEA.miR.1  DEA.miR.124  DEA.miR.137 
##        13872         3088         1817         5895         5111         3608 
##  DEA.miR.139  DEA.miR.143  DEA.miR.144  DEA.miR.153  DEA.miR.155  DEA.miR.182 
##         1242          461          298         2400         2194         2295 
## DEA.miR.199a  DEA.miR.204  DEA.miR.205 DEA.miR.216b  DEA.miR.223    DEA.miR.7 
##         1679          946         1207         1394         1705         3565
##      DEA.all  DEA.miR.122  DEA.miR.133  DEA.miR.138  DEA.miR.145  DEA.miR.184 
##        10504          461          587         3100          886          113 
## DEA.miR.190a DEA.miR.200b DEA.miR.216a  DEA.miR.217 DEA.miR.219a  DEA.miR.375 
##          748         1166          317          438          418          771 
## DEA.miR.451a 
##          137
##      DEA.all  DEA.miR.122  DEA.miR.133  DEA.miR.138  DEA.miR.145  DEA.miR.184 
##         8828          289          415         2051          451           84 
## DEA.miR.190a DEA.miR.200b DEA.miR.216a  DEA.miR.217 DEA.miR.219a  DEA.miR.375 
##          431          787          223          320          317          513 
## DEA.miR.451a 
##           90
## [1] 587
## [1] 415
##                b_in_nb   nb_in_b
## DEA.all      0.8272087 0.9842546
## DEA.miR.122  0.6268980 1.0000000
## DEA.miR.133  0.7069847 1.0000000
## DEA.miR.138  0.6609677 0.9990249
## DEA.miR.145  0.5090293 1.0000000
## DEA.miR.184  0.7433628 1.0000000
## DEA.miR.190a 0.5762032 1.0000000
## DEA.miR.200b 0.6749571 1.0000000
## DEA.miR.216a 0.7034700 1.0000000
## DEA.miR.217  0.7260274 0.9937500
## DEA.miR.219a 0.7559809 0.9968454
## DEA.miR.375  0.6653696 1.0000000
## DEA.miR.451a 0.6569343 1.0000000

We get more significant results with the batch corrected HEK DEA & almost all significant tx of the batch uncorrected HEK DEA are present in the corrected one. We thus choose the batch corrected version.